# -*- coding:utf8 -*-
# @Time : 2022/10/20 4:30 下午
# @Author : WanJie Wu

import json

def chinese_snli_clean(src_path: str, dst_path: str) -> None:
    """
    Clean the Chinese SNLI corpus (a Chinese translation of the Stanford
    Natural Language Inference dataset).

    Reads JSONL records from *src_path*, keeps only records in which
    'sentence1', 'sentence2' and 'gold_label' are all present and non-empty,
    and writes the surviving records as JSONL to *dst_path*.

    :param src_path: path to the source ``.jsonl`` file (one JSON object per line)
    :param dst_path: path of the cleaned output file (overwritten if present)
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    with open(src_path, 'r', encoding="utf8") as reader, \
            open(dst_path, 'w', encoding="utf8") as writer:
        # Stream line by line instead of readlines() + an intermediate list:
        # the train split is large and never needs to be fully in memory.
        for raw_line in reader:
            raw_line = raw_line.strip()
            if not raw_line:  # tolerate blank lines instead of crashing in json.loads
                continue
            record = json.loads(raw_line)
            # Drop records with a missing or empty sentence pair / label.
            if not record.get('sentence1') or not record.get('sentence2') \
                    or not record.get('gold_label'):
                continue
            writer.write(json.dumps(record, ensure_ascii=False) + "\n")


def convert_snli(ori_base_path: str = "/data/sdv1/wuwanjie/datasets/cnsd/SNLI/origin",
                 target_base_path: str = "/data/sdv1/wuwanjie/datasets/cnsd/SNLI/target") -> None:
    """
    Clean all three splits (train/test/dev) of the Chinese SNLI corpus.

    The source and target directories were previously hard-coded to one
    machine's layout; they are now parameters whose defaults preserve the
    original behavior, so existing callers are unaffected.

    :param ori_base_path: directory containing the ``cnsd_snli_v1.0.*.jsonl`` files
    :param target_base_path: directory that receives ``train.txt`` / ``test.txt`` / ``dev.txt``
    """
    # (source filename, cleaned output filename) per split, in the original
    # processing order: train, test, dev.
    splits = [
        ("cnsd_snli_v1.0.train.jsonl", "train.txt"),
        ("cnsd_snli_v1.0.test.jsonl", "test.txt"),
        ("cnsd_snli_v1.0.dev.jsonl", "dev.txt"),
    ]
    for src_name, dst_name in splits:
        chinese_snli_clean(f"{ori_base_path}/{src_name}",
                           f"{target_base_path}/{dst_name}")


if __name__ == '__main__':
    # Script entry point: clean all three Chinese SNLI splits in place.
    convert_snli()
